#load my standard toolbox
suppressWarnings(suppressMessages(library(tidyverse))) #required
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(lubridate)))
suppressWarnings(suppressMessages(library(forcats)))
suppressWarnings(suppressMessages(library(ggthemes)))
suppressWarnings(suppressMessages(library(reshape2)))
suppressPackageStartupMessages(library(dataMaid))
suppressPackageStartupMessages(library(ggplot2))
#extra tooling that is useful
suppressWarnings(suppressMessages(library(Hmisc))) #required
suppressWarnings(suppressMessages(library(pastecs))) #required
suppressWarnings(suppressMessages(library(stats))) #required
#some extras for mapping
suppressWarnings(suppressMessages(library(usmap))) #required
suppressWarnings(suppressMessages(library(maps)))
suppressWarnings(suppressMessages(library(mapdata)))
suppressWarnings(suppressMessages(library(ggmap)))
suppressWarnings(suppressMessages(library(ggrepel)))
suppressWarnings(suppressMessages(library(devtools)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(tcltk)))
suppressWarnings(suppressMessages(library(aplpack)))
suppressWarnings(suppressMessages(library(scales)))
#set print options
options(max.print=1000)
options(tibble.print_max = 1000, tibble.print_min = 1000)
#Session Info
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] tcltk stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] scales_1.0.0 aplpack_1.3.2 knitr_1.21 usethis_1.4.0
## [5] devtools_2.0.1 ggrepel_0.8.0 ggmap_3.0.0 mapdata_2.3.0
## [9] maps_3.3.0 usmap_0.4.0 pastecs_1.3.21 Hmisc_4.2-0
## [13] Formula_1.2-3 survival_2.42-3 lattice_0.20-35 dataMaid_1.2.0
## [17] reshape2_1.4.3 ggthemes_4.0.1 lubridate_1.7.4 forcats_0.4.0
## [21] stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.0 readr_1.3.1
## [25] tidyr_0.8.2 tibble_2.0.1 ggplot2_3.1.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 fs_1.2.6 bitops_1.0-6
## [4] RColorBrewer_1.1-2 httr_1.4.0 rprojroot_1.3-2
## [7] tools_3.5.1 backports_1.1.3 R6_2.4.0
## [10] rpart_4.1-13 lazyeval_0.2.1 colorspace_1.4-0
## [13] nnet_7.3-12 withr_2.1.2 tidyselect_0.2.5
## [16] gridExtra_2.3 prettyunits_1.0.2 processx_3.2.1
## [19] compiler_3.5.1 cli_1.0.1 rvest_0.3.2
## [22] htmlTable_1.13.1 xml2_1.2.0 desc_1.2.0
## [25] checkmate_1.9.1 DEoptimR_1.0-8 robustbase_0.93-3
## [28] callr_3.1.1 digest_0.6.18 foreign_0.8-70
## [31] rmarkdown_1.11 base64enc_0.1-3 jpeg_0.1-8
## [34] pkgconfig_2.0.2 htmltools_0.3.6 sessioninfo_1.1.1
## [37] htmlwidgets_1.3 rlang_0.3.1 readxl_1.3.0
## [40] rstudioapi_0.9.0 generics_0.0.2 jsonlite_1.6
## [43] acepack_1.4.1 magrittr_1.5 Matrix_1.2-14
## [46] Rcpp_1.0.0 munsell_0.5.0 stringi_1.3.1
## [49] yaml_2.2.0 pkgbuild_1.0.2 plyr_1.8.4
## [52] grid_3.5.1 crayon_1.3.4 haven_2.0.0
## [55] splines_3.5.1 pander_0.6.3 hms_0.4.2
## [58] ps_1.3.0 pillar_1.3.1 boot_1.3-20
## [61] rjson_0.2.20 pkgload_1.0.2 glue_1.3.0
## [64] evaluate_0.13 latticeExtra_0.6-28 remotes_2.0.2
## [67] data.table_1.12.0 modelr_0.1.4 png_0.1-7
## [70] RgoogleMaps_1.4.3 cellranger_1.1.0 gtable_0.2.0
## [73] assertthat_0.2.0 xfun_0.4 broom_0.5.1
## [76] memoise_1.1.0 cluster_2.0.7-1
#set your working directory for this code to work properly
root_dir<-"/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1"
setwd(root_dir)
#setwd("/Users/Dhyan/Desktop/DoingDataScience/r_mustangs/Case1")
#echo `pwd`/`ls beers.csv`
#/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1/Data/beers.csv
#echo `pwd`/`ls breweries.csv`
#/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1/Data/Breweries.csv
####################################
## IMPORT RAW DATASETS (being explicit with col_types after inspecting the data)
####################################
beers <- read_csv(file = "Data/Beers.csv",
col_types = cols(
Name = col_character(),
Beer_ID = col_integer(),
ABV = col_double(),
IBU = col_integer(),
Brewery_id = col_integer(),
Style = col_character(),
Ounces = col_double()
)
)
breweries <- read_csv(file = "Data/Breweries.csv",
col_types = cols(
Brew_ID = col_integer(),
Name = col_character(),
City = col_character(),
State = col_character()
)
)
####################################
## RUN DATAMAID ANALYSIS REPORTS ON RAW DATA
####################################
setwd("Analysis")
makeDataReport(beers, replace=TRUE)
makeDataReport(breweries, replace=TRUE)
setwd(root_dir)
#check for problems
problems(beers)
## [1] row col expected actual
## <0 rows> (or 0-length row.names)
problems(breweries)
## [1] row col expected actual
## <0 rows> (or 0-length row.names)
#summary of tables
kable(summary(beers))
| Name | Beer_ID | ABV | IBU | Brewery_id | Style | Ounces | |
|---|---|---|---|---|---|---|---|
| Length:2410 | Min. : 1.0 | Min. :0.00100 | Min. : 4.00 | Min. : 1.0 | Length:2410 | Min. : 8.40 | |
| Class :character | 1st Qu.: 808.2 | 1st Qu.:0.05000 | 1st Qu.: 21.00 | 1st Qu.: 94.0 | Class :character | 1st Qu.:12.00 | |
| Mode :character | Median :1453.5 | Median :0.05600 | Median : 35.00 | Median :206.0 | Mode :character | Median :12.00 | |
| NA | Mean :1431.1 | Mean :0.05977 | Mean : 42.71 | Mean :232.7 | NA | Mean :13.59 | |
| NA | 3rd Qu.:2075.8 | 3rd Qu.:0.06700 | 3rd Qu.: 64.00 | 3rd Qu.:367.0 | NA | 3rd Qu.:16.00 | |
| NA | Max. :2692.0 | Max. :0.12800 | Max. :138.00 | Max. :558.0 | NA | Max. :32.00 | |
| NA | NA | NA’s :62 | NA’s :1005 | NA | NA | NA |
kable(summary(breweries))
| Brew_ID | Name | City | State | |
|---|---|---|---|---|
| Min. : 1.0 | Length:558 | Length:558 | Length:558 | |
| 1st Qu.:140.2 | Class :character | Class :character | Class :character | |
| Median :279.5 | Mode :character | Mode :character | Mode :character | |
| Mean :279.5 | NA | NA | NA | |
| 3rd Qu.:418.8 | NA | NA | NA | |
| Max. :558.0 | NA | NA | NA |
#we have some NAs to worry about in ABV and IBU. brew_id max equals breweryid max which is good
#Question 1 - First we will find out how many breweries are in each state.
#For safety lets trim out any bad characters and whitespace
breweries$State <- str_trim(breweries$State)
#Find the count of Breweries by State
(breweries_count <- breweries %>%
filter(breweries$State %in% state.abb) %>%
count(State))
## # A tibble: 50 x 2
## State n
## <chr> <int>
## 1 AK 7
## 2 AL 3
## 3 AR 2
## 4 AZ 11
## 5 CA 39
## 6 CO 47
## 7 CT 8
## 8 DE 2
## 9 FL 15
## 10 GA 7
## 11 HI 4
## 12 IA 5
## 13 ID 5
## 14 IL 18
## 15 IN 22
## 16 KS 3
## 17 KY 4
## 18 LA 5
## 19 MA 23
## 20 MD 7
## 21 ME 9
## 22 MI 32
## 23 MN 12
## 24 MO 9
## 25 MS 2
## 26 MT 9
## 27 NC 19
## 28 ND 1
## 29 NE 5
## 30 NH 3
## 31 NJ 3
## 32 NM 4
## 33 NV 2
## 34 NY 16
## 35 OH 15
## 36 OK 6
## 37 OR 29
## 38 PA 25
## 39 RI 5
## 40 SC 4
## 41 SD 1
## 42 TN 3
## 43 TX 28
## 44 UT 4
## 45 VA 16
## 46 VT 10
## 47 WA 23
## 48 WI 20
## 49 WV 1
## 50 WY 4
#Plot the number of Breweries for each State
#Show ggplot. Center title, with axis titles
ggplot(data = breweries_count, aes(x=State, y=n, fill=n)) +
geom_bar(stat = "identity", colour="black") +
geom_text(aes(label=n, size=5, vjust = -0.5, hjust= 0.5), show.legend = FALSE) +
ggtitle("Breweries By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Breweries") +
guides(fill = guide_legend(title = "Breweries")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
As show in the above graph, the State population of Craft breweries varies widely depending on locality. For example, you have disparities even from “the State next door” such as 39 Breweries in California, and only 2 in Nevada. It is anticipated that these differences are largely due to State and Local laws and restrictions, population density, as well as material supply-chain and local preference factors for Craft Beer Brewing.
#Question 2 - We will now merge the beer and breweries datasets and verify the merge.
#rename column in breweries for simplicity on IDs
breweries <- rename(breweries, Brewery_id=Brew_ID)
#Merge both tables
merged_beers <- merge(beers, breweries, by='Brewery_id')
#Fix redundant names
merged_beers <- rename(merged_beers, BeerName=Name.x)
merged_beers <- rename(merged_beers, BrewerName=Name.y)
#View the structure of the new table
str(merged_beers)
## 'data.frame': 2410 obs. of 10 variables:
## $ Brewery_id: int 1 1 1 1 1 1 2 2 2 2 ...
## $ BeerName : chr "Get Together" "Maggie's Leap" "Wall's End" "Pumpion" ...
## $ Beer_ID : int 2692 2691 2690 2689 2688 2687 2686 2685 2684 2683 ...
## $ ABV : num 0.045 0.049 0.048 0.06 0.06 0.056 0.08 0.125 0.077 0.042 ...
## $ IBU : int 50 26 19 38 25 47 68 80 25 42 ...
## $ Style : chr "American IPA" "Milk / Sweet Stout" "English Brown Ale" "Pumpkin Ale" ...
## $ Ounces : num 16 16 16 16 16 16 16 16 16 16 ...
## $ BrewerName: chr "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
## $ City : chr "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
## $ State : chr "MN" "MN" "MN" "MN" ...
summary(merged_beers)
## Brewery_id BeerName Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces BrewerName
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
#We will view box plots of ABV and IBU to explore the dataset
ggplot(data=merged_beers) +
geom_boxplot(mapping = aes(x=State, y=ABV, fill=State), outlier.colour="black") +
theme_classic() +
ggtitle("ABV By State") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data=merged_beers) +
geom_boxplot(mapping = aes(x=State, y=IBU, fill=State), outlier.colour="black") +
theme_classic() +
ggtitle("IBU By State") +
theme(plot.title = element_text(hjust = 0.5))
################
## Lets Make a Better BoxPlot For Use In Presentations
################
color_by_median <- merged_beers %>%
group_by(State) %>%
summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE))
color_by_median <- left_join(color_by_median, merged_beers, by = c("State" = "State"))
ggplot(data=color_by_median) +
geom_boxplot(mapping = aes(x=State, y=ABV, fill=MedianIBU), outlier.colour="black") +
theme_classic() +
ggtitle("ABV By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Median IBU")) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(color_by_median$ABV, na.rm=TRUE)+0.005))
#ABV sorted by top median
ggplot(data=color_by_median) +
geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianABV), y=ABV, fill=MedianIBU), outlier.colour="black") +
theme_classic() +
ggtitle("Sorted Median ABV By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Median IBU")) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(color_by_median$ABV, na.rm=TRUE)+0.005)) +
labs(x="State", y="ABV")
ggplot(data=color_by_median) +
geom_boxplot(mapping = aes(x=State, y=IBU, fill=MedianABV), outlier.colour="black") +
theme_classic() +
ggtitle("IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Median ABV")) +
scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
scale_fill_gradient( labels = percent)
#IBU sorted by top median
ggplot(data=color_by_median) +
geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianIBU), y=IBU, fill=MedianABV), outlier.colour="black") +
theme_classic() +
ggtitle("Sorted Median IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Median ABV")) +
scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
scale_fill_gradient( labels = percent) +
labs(x="State", y="IBU")
#Draw a line at 50 so we can determine High Versus Low
#IBU sorted by top median w/line
ggplot(data=color_by_median) +
geom_boxplot(mapping = aes(x=reorder(color_by_median$State, color_by_median$MedianIBU), y=IBU, fill=MedianABV), outlier.colour="black") +
geom_abline(intercept = 50, slope = 0, na.rm=TRUE, linetype="dashed", size=1.5, col = "Red3") +
theme_classic() +
ggtitle("Sorted 50< Median IBU >50 By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Median ABV")) +
scale_y_continuous(limits = c(NA, max(color_by_median$IBU, na.rm=TRUE)+0.005)) +
scale_fill_gradient( labels = percent) +
labs(x="State", y="IBU") +
annotate("text", x=5, y=4, label = "<50 IBU = Malty", size=5, fontface = "bold", col="Red3") +
annotate("text", x=5, y=120, label = ">50 IBU = Hoppy", size=5, fontface = "bold", col = "Green4")
#And we will find the count of brews by state
(brews_count <- merged_beers %>%
group_by(State) %>%
count(State))
## # A tibble: 51 x 2
## # Groups: State [51]
## State n
## <chr> <int>
## 1 AK 25
## 2 AL 10
## 3 AR 5
## 4 AZ 47
## 5 CA 183
## 6 CO 265
## 7 CT 27
## 8 DC 8
## 9 DE 2
## 10 FL 58
## 11 GA 16
## 12 HI 27
## 13 IA 30
## 14 ID 30
## 15 IL 91
## 16 IN 139
## 17 KS 23
## 18 KY 21
## 19 LA 19
## 20 MA 82
## 21 MD 21
## 22 ME 27
## 23 MI 162
## 24 MN 55
## 25 MO 42
## 26 MS 11
## 27 MT 40
## 28 NC 59
## 29 ND 3
## 30 NE 25
## 31 NH 8
## 32 NJ 8
## 33 NM 14
## 34 NV 11
## 35 NY 74
## 36 OH 49
## 37 OK 19
## 38 OR 125
## 39 PA 100
## 40 RI 27
## 41 SC 14
## 42 SD 7
## 43 TN 6
## 44 TX 130
## 45 UT 26
## 46 VA 40
## 47 VT 27
## 48 WA 68
## 49 WI 87
## 50 WV 2
## 51 WY 15
###################
## THIS CODE IS INCLUDED TO CREATE BAGPLOT FOR VISUALS
## PURELY FOR EXPLORATORY DATA ANALYSIS AND POTENTIAL PRESENTATION
###################
StatBag <- ggproto("Statbag", Stat,
compute_group = function(data, scales, prop = 0.5) {
#################################
#################################
# originally from aplpack package, plotting functions removed
plothulls_ <- function(x, y, fraction, n.hull = 1,
col.hull, lty.hull, lwd.hull, density=0, ...){
# function for data peeling:
# x,y : data
# fraction.in.inner.hull : max percentage of points within the hull to be drawn
# n.hull : number of hulls to be plotted (if there is no fractiion argument)
# col.hull, lty.hull, lwd.hull : style of hull line
# plotting bits have been removed, BM 160321
# pw 130524
if(ncol(x) == 2){ y <- x[,2]; x <- x[,1] }
n <- length(x)
if(!missing(fraction)) { # find special hull
n.hull <- 1
if(missing(col.hull)) col.hull <- 1
if(missing(lty.hull)) lty.hull <- 1
if(missing(lwd.hull)) lwd.hull <- 1
x.old <- x; y.old <- y
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
for( i in 1:(length(x)/3)){
x <- x[-idx]; y <- y[-idx]
if( (length(x)/n) < fraction ){
return(cbind(x.hull,y.hull))
}
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx];
}
}
if(missing(col.hull)) col.hull <- 1:n.hull
if(length(col.hull)) col.hull <- rep(col.hull,n.hull)
if(missing(lty.hull)) lty.hull <- 1:n.hull
if(length(lty.hull)) lty.hull <- rep(lty.hull,n.hull)
if(missing(lwd.hull)) lwd.hull <- 1
if(length(lwd.hull)) lwd.hull <- rep(lwd.hull,n.hull)
result <- NULL
for( i in 1:n.hull){
idx <- chull(x,y); x.hull <- x[idx]; y.hull <- y[idx]
result <- c(result, list( cbind(x.hull,y.hull) ))
x <- x[-idx]; y <- y[-idx]
if(0 == length(x)) return(result)
}
result
} # end of definition of plothulls
#################################
# prepare data to go into function below
the_matrix <- matrix(data = c(data$x, data$y), ncol = 2)
# get data out of function as df with names
setNames(data.frame(plothulls_(the_matrix, fraction = prop)), nm = c("x", "y"))
# how can we get the hull and loop vertices passed on also?
},
required_aes = c("x", "y")
)
#' @inheritParams ggplot2::stat_identity
#' @param prop Proportion of all the points to be included in the bag (default is 0.5)
stat_bag <- function(mapping = NULL, data = NULL, geom = "polygon",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE, prop = 0.5, alpha = 0.3, ...) {
layer(
stat = StatBag, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, prop = prop, alpha = alpha, ...)
)
}
geom_bag <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
prop = 0.5,
alpha = 0.3,
...,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
layer(
data = data,
mapping = mapping,
stat = StatBag,
geom = GeomBag,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(
na.rm = na.rm,
alpha = alpha,
prop = prop,
...
)
)
}
#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
GeomBag <- ggproto("GeomBag", Geom,
draw_group = function(data, panel_scales, coord) {
n <- nrow(data)
if (n == 1) return(zeroGrob())
munched <- coord_munch(coord, data, panel_scales)
# Sort by group to make sure that colors, fill, etc. come in same order
munched <- munched[order(munched$group), ]
# For gpar(), there is one entry per polygon (not one entry per point).
# We'll pull the first value from each group, and assume all these values
# are the same within each group.
first_idx <- !duplicated(munched$group)
first_rows <- munched[first_idx, ]
ggplot2:::ggname("geom_bag",
grid:::polygonGrob(munched$x, munched$y, default.units = "native",
id = munched$group,
gp = grid::gpar(
col = first_rows$colour,
fill = alpha(first_rows$fill, first_rows$alpha),
lwd = first_rows$size * .pt,
lty = first_rows$linetype
)
)
)
},
default_aes = aes(colour = "NA", fill = "grey20", size = 0.5, linetype = 1,
alpha = NA, prop = 0.5),
handle_na = function(data, params) {
data
},
required_aes = c("x", "y"),
draw_key = draw_key_polygon
)
#create a clean space to work
bag_of_beers <- merged_beers
#find styles
bag_of_beers$Style <- str_trim(bag_of_beers$Style)
bag_of_beers$Style <- str_replace_all(bag_of_beers$Style, "[[:punct:]]", "")
style_counts<- bag_of_beers %>%
group_by(Style) %>%
count
style_counts<-style_counts[order(-style_counts$n), ]
#Find Quartiles
bag_of_beers_summary <- summary(bag_of_beers$ABV)
ABV_Q1 <- bag_of_beers_summary[2]
ABV_Q2 <- bag_of_beers_summary[3]
ABV_Q3 <- bag_of_beers_summary[5]
bag_of_beers_summary <- summary(bag_of_beers$IBU)
IBU_Q1 <- bag_of_beers_summary[2]
IBU_Q2 <- bag_of_beers_summary[3]
IBU_Q3 <- bag_of_beers_summary[5]
#find top styles
STYLE_1 <- as.character(style_counts[1,1])
STYLE_2 <- as.character(style_counts[2,1])
STYLE_3 <- as.character(style_counts[3,1])
STYLE_1 <- str_replace_all(STYLE_1, "[[:punct:]]", "")
STYLE_2 <- str_replace_all(STYLE_2, "[[:punct:]]", "")
STYLE_3 <- str_replace_all(STYLE_3, "[[:punct:]]", "")
bag_of_beers$IBU_class <- ifelse(bag_of_beers$IBU<=IBU_Q1, "< 1st Quartile",
ifelse(bag_of_beers$IBU>IBU_Q1 & bag_of_beers$IBU<=IBU_Q2, "< 2nd Quartile (Median)",
ifelse(bag_of_beers$IBU>IBU_Q2 & bag_of_beers$IBU<=IBU_Q3, "< 3rd Quartile",
ifelse(bag_of_beers$IBU>IBU_Q3, "> 3rd Quartile", "none"))))
bag_of_beers$ABV_class <- ifelse(bag_of_beers$ABV<=ABV_Q1, "< 1st Quartile",
ifelse(bag_of_beers$ABV>ABV_Q1 & bag_of_beers$ABV<=ABV_Q2, "< 2nd Quartile (Median)",
ifelse(bag_of_beers$ABV>ABV_Q2 & bag_of_beers$ABV<=ABV_Q3, "< 3rd Quartile",
ifelse(bag_of_beers$ABV>ABV_Q3 & bag_of_beers$ABV<=1, "> 3rd Quartile", "None"))))
bag_of_beers$Style_class <- ifelse(str_detect(bag_of_beers$Style,STYLE_1)==TRUE, STYLE_1,
ifelse(str_detect(bag_of_beers$Style,STYLE_2)==TRUE, STYLE_2,
ifelse(str_detect(bag_of_beers$Style, STYLE_3)==TRUE, STYLE_3, NA)))
#find style medians for ABV and IBU
style_medians <- merged_beers %>%
group_by(Style) %>%
summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE))
style_medians <- left_join(style_medians, style_counts, by = c("Style" = "Style"))
#bag_of_beers <- left_join(bag_of_beers, style_medians, by = c("Style" = "Style"))
#CREATE BAGPLOTS
ggplot(data=bag_of_beers, aes(x=IBU, y=ABV, fill=ABV_class)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
ggtitle("ABV Class Groups") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data=bag_of_beers, aes(x=IBU, y=ABV, fill=IBU_class)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
ggtitle("IBU Class Groups") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data=subset(bag_of_beers, !is.na(Style_class)), aes(x=IBU, y=ABV, fill=Style_class)) +
geom_point() +
stat_bag(prop = 0.95) + # enclose 95% of points
stat_bag(prop = 0.5, alpha = 0.5) + # enclose 50% of points
stat_bag(prop = 0.05, alpha = 0.9) + # enclose 5% of points
ggtitle("Top 3 Most Popular Style Class Groups") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Top 3 Styles"))
#plot of ABV vs IBU for the Top x Styles
#choose x
num_rows<-5
ggplot(data=head(style_medians[order(-style_medians$n),],num_rows),
aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
geom_jitter(shape = 21) +
geom_text(aes(label=head(style_medians[order(-style_medians$n),],num_rows)$n), size=5
, position = position_dodge(width=0), vjust = -1, hjust= 0
#, position = position_nudge(y = 0.0015)
) +
ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
theme(plot.title = element_text(hjust = 1)) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
labs(x = "Median IBU", y="Median ABV") +
labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
scale_size_continuous(range=c(1, 30)) +
scale_colour_continuous(guide = FALSE) +
guides(fill = guide_legend(title = "Beer Style"))
#choose x
num_rows<-10
ggplot(data=head(style_medians[order(-style_medians$n),],num_rows),
aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
geom_jitter(shape = 21) +
geom_text(aes(label=head(style_medians[order(-style_medians$n),],num_rows)$n), size=5
, position = position_dodge(width=0), vjust = -0.5, hjust= 0.1
#, position = position_nudge(y = 0.0015)
) +
ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
theme(plot.title = element_text(hjust = 1)) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
labs(x = "Median IBU", y="Median ABV") +
labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
scale_size_continuous(range=c(1, 30)) +
scale_colour_continuous(guide = FALSE) +
guides(fill = guide_legend(title = "Beer Style"))
#choose x
num_rows<-5
ggplot(data=head(style_medians[order(-style_medians$n),],num_rows),
aes(x=MedianIBU, y=MedianABV, size=n, fill=Style, label = Style)) +
geom_jitter(shape = 21) +
geom_text(aes(label=str_remove(head(style_medians[order(-style_medians$n),],num_rows)$Style, "American ")), size=5
, position = position_dodge(width=0), vjust = -.06, hjust= 0
#, position = position_nudge(y = 0.0015)
) +
ggtitle(paste0("ABV vs IBU for the Top ", num_rows," Most Popular Styles"))+
theme(plot.title = element_text(hjust = 1)) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(style_medians$MedianABV)+0.005)) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
labs(x = "Median IBU", y="Median ABV") +
labs(caption = "Bubble size represents most popularly represented styles by number of beers in the provided data.") +
scale_size_continuous(range=c(1, 30)) +
scale_colour_continuous(guide = FALSE) +
guides(fill = guide_legend(title = "Beer Style"))
##################
#Plot facets to view by state
#this might be helpful to view gaps in market or saturation
ggplot(data = merged_beers) +
geom_jitter(mapping = aes(x=ABV, y=IBU)) +
facet_wrap(~ State, nrow = 13)
#Finally, lets print first 6 lines and last six lines
#so we can answer the question directly
kable(head(merged_beers, 6))
| Brewery_id | BeerName | Beer_ID | ABV | IBU | Style | Ounces | BrewerName | City | State |
|---|---|---|---|---|---|---|---|---|---|
| 1 | Get Together | 2692 | 0.045 | 50 | American IPA | 16 | NorthGate Brewing | Minneapolis | MN |
| 1 | Maggie’s Leap | 2691 | 0.049 | 26 | Milk / Sweet Stout | 16 | NorthGate Brewing | Minneapolis | MN |
| 1 | Wall’s End | 2690 | 0.048 | 19 | English Brown Ale | 16 | NorthGate Brewing | Minneapolis | MN |
| 1 | Pumpion | 2689 | 0.060 | 38 | Pumpkin Ale | 16 | NorthGate Brewing | Minneapolis | MN |
| 1 | Stronghold | 2688 | 0.060 | 25 | American Porter | 16 | NorthGate Brewing | Minneapolis | MN |
| 1 | Parapet ESB | 2687 | 0.056 | 47 | Extra Special / Strong Bitter (ESB) | 16 | NorthGate Brewing | Minneapolis | MN |
kable(tail(merged_beers, 6))
| Brewery_id | BeerName | Beer_ID | ABV | IBU | Style | Ounces | BrewerName | City | State | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2405 | 556 | Pilsner Ukiah | 98 | 0.055 | NA | German Pilsener | 12 | Ukiah Brewing Company | Ukiah | CA |
| 2406 | 557 | Heinnieweisse Weissebier | 52 | 0.049 | NA | Hefeweizen | 12 | Butternuts Beer and Ale | Garrattsville | NY |
| 2407 | 557 | Snapperhead IPA | 51 | 0.068 | NA | American IPA | 12 | Butternuts Beer and Ale | Garrattsville | NY |
| 2408 | 557 | Moo Thunder Stout | 50 | 0.049 | NA | Milk / Sweet Stout | 12 | Butternuts Beer and Ale | Garrattsville | NY |
| 2409 | 557 | Porkslap Pale Ale | 49 | 0.043 | NA | American Pale Ale (APA) | 12 | Butternuts Beer and Ale | Garrattsville | NY |
| 2410 | 558 | Urban Wilderness Pale Ale | 30 | 0.049 | NA | English Pale Ale | 12 | Sleeping Lady Brewing Company | Anchorage | AK |
As requested, the R Mustangs team merged the data sets provided. Although we found some gaps in the data, we were successful in pulling all data together for analysis.
Displayed above are various Exploratory Data Analysis (EDA) techniqiues we employed on the data to validate and understand what has been provided. Directly above, the first and last 6 lines of the merged dataset is provided for reference and understanding.
#Question 3 - We will find out how many NAs are in each column for the merged dataset,
#so we can report it
kable(colSums(is.na(merged_beers)))
| x | |
|---|---|
| Brewery_id | 0 |
| BeerName | 0 |
| Beer_ID | 0 |
| ABV | 62 |
| IBU | 1005 |
| Style | 5 |
| Ounces | 0 |
| BrewerName | 0 |
| City | 0 |
| State | 0 |
missing_values <- colSums(is.na(merged_beers))
missing_values <- as.data.frame(missing_values)
names(missing_values)[1] <- "MissingValues"
missing_values<-missing_values %>% rownames_to_column("Variable")
ggplot(data = missing_values, aes(x=Variable, y=MissingValues, fill=MissingValues)) +
geom_bar(stat = "identity", colour="black") +
geom_text(aes(label=MissingValues, size=5, vjust = -0.5, hjust= 0.5), show.legend = FALSE) +
ggtitle("Missing Values") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Variables", y="MIssing Values") +
guides(fill = guide_legend(title = "Dataset Missing Values")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
Shown above is a “Missing Values” report that shows where we have gaps in the existing data. While not ideal, we can still proceed with analysis and proceed with caution under this understanding. For the future, we would recommend review on data quality procedures in order to ensure the lineage and sanctity of the data sets wherever possible.
#Question 4 - We will compute the median alcohol content and international bitterness unit by state
#then plot the results
(medians <- merged_beers %>%
group_by(State) %>%
summarise(MedianABV = median(ABV, na.rm = TRUE), MedianIBU = median(IBU, na.rm = TRUE)))
## # A tibble: 51 x 3
## State MedianABV MedianIBU
## <chr> <dbl> <dbl>
## 1 AK 0.056 46
## 2 AL 0.06 43
## 3 AR 0.052 39
## 4 AZ 0.055 20.5
## 5 CA 0.058 42
## 6 CO 0.0605 40
## 7 CT 0.06 29
## 8 DC 0.0625 47.5
## 9 DE 0.055 52
## 10 FL 0.057 55
## 11 GA 0.055 55
## 12 HI 0.054 22.5
## 13 IA 0.0555 26
## 14 ID 0.0565 39
## 15 IL 0.058 30
## 16 IN 0.058 33
## 17 KS 0.05 20
## 18 KY 0.0625 31.5
## 19 LA 0.052 31.5
## 20 MA 0.054 35
## 21 MD 0.058 29
## 22 ME 0.051 61
## 23 MI 0.062 35
## 24 MN 0.056 44.5
## 25 MO 0.052 24
## 26 MS 0.058 45
## 27 MT 0.055 40
## 28 NC 0.057 33.5
## 29 ND 0.05 32
## 30 NE 0.056 35
## 31 NH 0.055 48.5
## 32 NJ 0.046 34.5
## 33 NM 0.062 51
## 34 NV 0.06 41
## 35 NY 0.055 47
## 36 OH 0.058 40
## 37 OK 0.06 35
## 38 OR 0.056 40
## 39 PA 0.057 30
## 40 RI 0.055 24
## 41 SC 0.055 30
## 42 SD 0.06 NA
## 43 TN 0.0570 37
## 44 TX 0.055 33
## 45 UT 0.04 34
## 46 VA 0.0565 42
## 47 VT 0.055 30
## 48 WA 0.0555 38
## 49 WI 0.052 19
## 50 WV 0.062 57.5
## 51 WY 0.05 21
#Plot ABV graph
#Show ggplot. Center title, with axis titles
ggplot(data = medians, aes(x=medians$State, y=medians$MedianABV, fill=medians$MedianABV)) +
geom_bar(stat = "identity", colour="black") +
scale_fill_gradient2(labels = percent, midpoint=median(medians$MedianABV, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(medians$MedianABV)+0.005)) +
##sec.axis = sec_axis(~. * 25, name = "MedianIBU") +
ggtitle("Median ABV By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Median ABV in %") +
guides(fill = guide_legend(title = "Median ABV")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
#Plot ABV sort graph
#Show ggplot. Center title, with axis titles
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianABV), y=medians$MedianABV, fill=medians$MedianABV)) +
geom_bar(stat = "identity", colour="black") +
scale_fill_gradient2(labels = percent, midpoint=median(medians$MedianABV, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(medians$MedianABV)+0.005)) +
##sec.axis = sec_axis(~. * 25, name = "MedianIBU") +
ggtitle("Median ABV By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Median ABV in %") +
guides(fill = guide_legend(title = "Median ABV")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
#Set chart title for mean
mean_ABV <- paste0("ABV Histogram: Mean = ",round(mean(merged_beers$ABV, na.rm = TRUE), digits = 3)*100, "%")
#Show Histogram to show distribution of values for ABV
ggplot(data=merged_beers, aes(merged_beers$ABV, fill=..count..)) +
geom_histogram(binwidth = .005) +
geom_vline(aes(xintercept = mean(merged_beers$ABV, na.rm = TRUE)),col='black', size=2) +
labs(x="ABV", y="Frequency") +
guides(fill = guide_legend(title = "ABV Frequency")) +
ggtitle(mean_ABV) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold"))
#Plot IBU graph
ggplot(data = medians, aes(x=medians$State, y=medians$MedianIBU, fill=medians$MedianIBU)) +
geom_bar(stat = "identity", colour="black") +
scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
ggtitle("Median IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Median IBU") +
guides(fill = guide_legend(title = "Median IBU")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
#Plot IBU sort graph
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianIBU), y=medians$MedianIBU, fill=medians$MedianIBU)) +
geom_bar(stat = "identity", colour="black") +
scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
ggtitle("Median IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Median IBU") +
guides(fill = guide_legend(title = "Median IBU")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold"))
#Plot IBU sort graph
#Draw a line at 50 so we can determine High Versus Low
ggplot(data = medians, aes(x=reorder(medians$State, medians$MedianIBU), y=medians$MedianIBU, fill=medians$MedianIBU)) +
geom_bar(stat = "identity", colour="black") +
geom_abline(intercept = 50, slope = 0, na.rm=TRUE, linetype="dashed", size=1.5, col = "Red3") +
scale_fill_gradient2(midpoint=median(medians$MedianIBU, na.rm = TRUE),low='#deebf7', mid='#9ecae1', high='#3182bd', space='Lab', aesthetics = "fill") +
scale_y_continuous(limits = c(NA, max(medians$MedianIBU)+0.005)) +
ggtitle("Median IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="State", y="Median IBU") +
guides(fill = guide_legend(title = "Median IBU")) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
theme(axis.title=element_text(face="bold")) +
annotate("text", x=5, y=30, label = "<50 IBU = Malty", size=5, fontface = "bold", col="Red3") +
annotate("text", x=5, y=80, label = ">50 IBU = Hoppy", size=5, fontface = "bold", col = "Green4")
#Set chart title for mean
mean_IBU <- paste0("IBU Histogram: Mean = ",round(mean(merged_beers$IBU, na.rm = TRUE), digits = 0))
#Show Histogram to show distribution of values for IBU
ggplot(data=merged_beers, aes(merged_beers$IBU, fill=..count..)) +
geom_histogram(binwidth = 5) +
geom_vline(aes(xintercept = mean(merged_beers$IBU, na.rm = TRUE)),col='black',size=2) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
labs(x="IBU Frequency", y="Frequency") +
guides(fill = guide_legend(title = "IBU")) +
ggtitle(mean_IBU) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold"))
Displayed above are various graphs of both ABV and IBU as they relate to the United States. Please note the sorted graphs are the most informative for this exercise as they will help distinguish which states have what types of preferences for their locality. Alcohol content and bitterness preference vary in terms of regional tastes.
#Question 5 - Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
#Lets find out explictly which state has the highest ABV and IBU
#We will answer this with explicit data pulls below, plus graphically with existing charts
#State with max ABV
kable(merged_beers %>%
filter(ABV == max(ABV, na.rm=TRUE)))
| Brewery_id | BeerName | Beer_ID | ABV | IBU | Style | Ounces | BrewerName | City | State |
|---|---|---|---|---|---|---|---|---|---|
| 52 | Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale | 2565 | 0.128 | NA | Quadrupel (Quad) | 19.2 | Upslope Brewing Company | Boulder | CO |
#Top 5 with max absolute ABV
kable(head(merged_beers[order(-merged_beers$ABV),], 5))
| Brewery_id | BeerName | Beer_ID | ABV | IBU | Style | Ounces | BrewerName | City | State | |
|---|---|---|---|---|---|---|---|---|---|---|
| 375 | 52 | Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale | 2565 | 0.128 | NA | Quadrupel (Quad) | 19.2 | Upslope Brewing Company | Boulder | CO |
| 8 | 2 | London Balling | 2685 | 0.125 | 80 | English Barleywine | 16.0 | Against the Grain Brewery | Louisville | KY |
| 144 | 18 | Csar | 2621 | 0.120 | 90 | Russian Imperial Stout | 16.0 | Tin Man Brewing Company | Evansville | IN |
| 376 | 52 | Lee Hill Series Vol. 4 - Manhattan Style Rye Ale | 2564 | 0.104 | NA | Rye Beer | 19.2 | Upslope Brewing Company | Boulder | CO |
| 336 | 47 | 4Beans | 2574 | 0.100 | 52 | Baltic Porter | 12.0 | Sixpoint Craft Ales | Brooklyn | NY |
#State with max IBU
merged_beers %>%
filter(IBU == max(IBU, na.rm=TRUE))
## Brewery_id BeerName Beer_ID ABV IBU
## 1 375 Bitter Bitch Imperial IPA 980 0.082 138
## Style Ounces BrewerName City
## 1 American Double / Imperial IPA 12 Astoria Brewing Company Astoria
## State
## 1 OR
#Top 5 with max absolute IBU
kable(head(merged_beers[order(-merged_beers$IBU),], 5))
| Brewery_id | BeerName | Beer_ID | ABV | IBU | Style | Ounces | BrewerName | City | State | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1857 | 375 | Bitter Bitch Imperial IPA | 980 | 0.082 | 138 | American Double / Imperial IPA | 12 | Astoria Brewing Company | Astoria | OR |
| 1719 | 345 | Troopers Alley IPA | 1676 | 0.059 | 135 | American IPA | 12 | Wolf Hills Brewing Company | Abingdon | VA |
| 1305 | 231 | Dead-Eye DIPA | 2067 | 0.090 | 130 | American Double / Imperial IPA | 16 | Cape Ann Brewing Company | Gloucester | MA |
| 625 | 100 | Bay of Bengal Double IPA (2014) | 2440 | 0.089 | 126 | American Double / Imperial IPA | 12 | Christian Moerlein Brewing Company | Cincinnati | OH |
| 431 | 62 | Abrasive Ale | 15 | 0.097 | 120 | American Double / Imperial IPA | 16 | Surly Brewing Company | Brooklyn Center | MN |
# MAX MEDIAN ABV STATES
kable(head(medians[order(-medians$MedianABV),], 5))
| State | MedianABV | MedianIBU |
|---|---|---|
| DC | 0.0625 | 47.5 |
| KY | 0.0625 | 31.5 |
| MI | 0.0620 | 35.0 |
| NM | 0.0620 | 51.0 |
| WV | 0.0620 | 57.5 |
# MAX MEDIAN IBU STATES
kable(head(medians[order(-medians$MedianIBU),], 5))
| State | MedianABV | MedianIBU |
|---|---|---|
| ME | 0.051 | 61.0 |
| WV | 0.062 | 57.5 |
| FL | 0.057 | 55.0 |
| GA | 0.055 | 55.0 |
| DE | 0.055 | 52.0 |
The maximum alcohol content (ABV) beer is a beer from Colorado named the Lee Hill Series Vol 5 - Belgian Style Quadrupel Ale, from a brewer named Upslope Breweing Company. This beer has an ABV of 12.8%, which represents an outlier in terms of the overall sample of craft beers we reviewed.
The maximum bitterness (IBU) beer is a beer from Oregon named the Bitter Bitch Imperial IPA, from a brewer by the name of Astoria Brewing Company. Their reported IBU for that beer is 138, which also represents a very high number. This also may represent suspect data as the IBU rating is a rating that measures the amount of isomerized alpha acids in a beer. Humans can only detect up to 100, which is why typically IBU is reported on a 0 to 100 scale. We merely recommend keeping this in mind for the upper echelons of IBU.
There is nothing particularly wrong the reported 138 IBU as its still representative of hops alpha acids for the beer. However, for future research we need to keep in mind the upper bound of 100 for taste preferences. Its still correct to report 138 IBU as it relates to alcohol content as we will see later in the analysis.
For comprehensiveness we also find above the max Median ABV and max Median IBU states as well. District of Columbia and Kentucky are the maximum Median ABV territory and State (6.25% Alcohol), and Maine is the maximum Median IBU State (61 IBU), followed by West Virgina (57 IBU). This is also displayed above in tabular form.
#Question 6 - We will aggregate summary statistics for ABV
#Summary for ABV Stats
kable(as.array(summary(merged_beers$ABV)))
| Var1 | Freq |
|---|---|
| Min. | 0.0010000 |
| 1st Qu. | 0.0500000 |
| Median | 0.0560000 |
| Mean | 0.0597734 |
| 3rd Qu. | 0.0670000 |
| Max. | 0.1280000 |
| NA’s | 62.0000000 |
describe(merged_beers$ABV)
## merged_beers$ABV
## n missing distinct Info Mean Gmd .05 .10
## 2348 62 74 0.998 0.05977 0.01469 0.042 0.045
## .25 .50 .75 .90 .95
## 0.050 0.056 0.067 0.080 0.087
##
## lowest : 0.001 0.027 0.028 0.032 0.034, highest: 0.100 0.104 0.120 0.125 0.128
kable(stat.desc(merged_beers$ABV))
| x | |
|---|---|
| nbr.val | 2348.0000000 |
| nbr.null | 0.0000000 |
| nbr.na | 62.0000000 |
| min | 0.0010000 |
| max | 0.1280000 |
| range | 0.1270000 |
| sum | 140.3480000 |
| median | 0.0560000 |
| mean | 0.0597734 |
| SE.mean | 0.0002795 |
| CI.mean.0.95 | 0.0005480 |
| var | 0.0001834 |
| std.dev | 0.0135417 |
| coef.var | 0.2265511 |
#We will do IBU for good measure
kable(as.array(summary(merged_beers$IBU)))
| Var1 | Freq |
|---|---|
| Min. | 4.00000 |
| 1st Qu. | 21.00000 |
| Median | 35.00000 |
| Mean | 42.71317 |
| 3rd Qu. | 64.00000 |
| Max. | 138.00000 |
| NA’s | 1005.00000 |
describe(merged_beers$IBU)
## merged_beers$IBU
## n missing distinct Info Mean Gmd .05 .10
## 1405 1005 107 0.999 42.71 28.81 11 15
## .25 .50 .75 .90 .95
## 21 35 64 80 92
##
## lowest : 4 5 6 7 8, highest: 120 126 130 135 138
kable(stat.desc(merged_beers$IBU))
| x | |
|---|---|
| nbr.val | 1.405000e+03 |
| nbr.null | 0.000000e+00 |
| nbr.na | 1.005000e+03 |
| min | 4.000000e+00 |
| max | 1.380000e+02 |
| range | 1.340000e+02 |
| sum | 6.001200e+04 |
| median | 3.500000e+01 |
| mean | 4.271317e+01 |
| SE.mean | 6.924162e-01 |
| CI.mean.0.95 | 1.358282e+00 |
| var | 6.736135e+02 |
| std.dev | 2.595407e+01 |
| coef.var | 6.076362e-01 |
Summary and descritive statistics we aggregated for both ABV and IBU. This is covered in detail in our presentation, and is displayed above. This data, in combination with the histgrams generated, cover the full shape and behavior of the data provided.
#Question 7 - There seems to be a linear relation to ABV and IBU. We will draw a scatter plot
#Plus, we will find the linear regression line fit so we can predict missing values
reg<-lm(ABV ~ IBU, data = merged_beers)
reg
##
## Call:
## lm(formula = ABV ~ IBU, data = merged_beers)
##
## Coefficients:
## (Intercept) IBU
## 0.0449303 0.0003508
(r_squared<-summary(lm(ABV~IBU, merged_beers))$r.squared)
## [1] 0.4497332
# Equation of the line :
coeff=coefficients(reg)
eq = paste0("y = ", round(reg$coefficients[2],5)*100, "*x + ", reg$coefficients[1]*100,1)
scatter_title <- paste0("IBU vs ABV Relationship", "\n\n", eq)
#Asside: to find values where we have IBU but do not have ABV
predicted_values_IBU <- merged_beers %>%
filter(is.na(merged_beers$ABV) == FALSE, is.na(merged_beers$IBU) == TRUE)
predicted_values_IBU$IBU <- ifelse((predicted_values_IBU$ABV-reg$coefficients[1])/reg$coefficients[2]>0,
(predicted_values_IBU$ABV-reg$coefficients[1])/reg$coefficients[2],
NA)
#Plot scatter plot of ABV vs IBU, with prediected values
ggplot(data=merged_beers, aes(x=IBU, y=ABV, color=State)) +
geom_jitter() +
geom_abline(intercept = reg$coefficients[1], slope = reg$coefficients[2], na.rm=TRUE, linetype="dashed") +
ggtitle(eq) +
theme(plot.title = element_text(hjust = 1)) +
scale_y_continuous(labels = scales::percent, limits = c(NA, max(merged_beers$ABV)+0.005)) +
ggtitle(scatter_title) +
theme(plot.title = element_text(hjust = 0.5, size = 15, face="bold")) +
theme(axis.text = element_text(size=15), axis.title=element_text(size=15,face="bold")) +
geom_point(data = predicted_values_IBU,
mapping = aes(x = IBU, y = ABV, shape=State),
shape = 0,
size =4,
show.legend=TRUE) +
labs(caption = "Square datapoints represent predicted calculated values where they were missing.")
As shown above, there does seem to be a linear relationship to ABV and IBU, as mentioned earlier. Utilizing a linear regression we found the relation with an R Squared = 0.4497332. This says that the fit only explains 45% of the variability around the mean.
Our conclusion based on this cursory analysis is that IBU is an ok proxy for predicting Alcohol content, however it does not predict a preponderance of the cases precisely.
#####################################
## THIS IS EXTRA ANALYSIS TO CONSIDER FOR PRESENTATION AND CASE
#####################################
## IN ORDER TO ROUND OUT THE DISCUSSION FURTHER DATA WAS PULLED
## IN TO ENRICH THE PROVIDED DATASETS
## WE UTILIZE BOTH ALCOHOL CONSUMPTION DATA FROM OPEN ICPSR
## AS WELL AS US CENSUS BUREAU DATA
#####################################
# https://www.openicpsr.org/openicpsr/project/105583/version/V2/view;jsessionid=843DFC2FBDC320D1624BF92319E643FA?path=/openicpsr/105583/fcr:versions/V2/apparent_per_capita_alcohol_consumption.csv&type
##
##https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1.csv
#####################################
#Asside: to find na values for style
merged_beers %>%
filter(is.na(merged_beers$Style) == TRUE)
## Brewery_id BeerName Beer_ID ABV IBU Style Ounces
## 1 30 Special Release 2210 NA NA <NA> 16
## 2 67 OktoberFiesta 2527 0.053 27 <NA> 12
## 3 161 Kilt Lifter Scottish-Style Ale 1635 0.060 21 <NA> 12
## 4 167 The CROWLER™ 1796 NA NA <NA> 32
## 5 167 CAN'D AID Foundation 1790 NA NA <NA> 12
## BrewerName City State
## 1 Cedar Creek Brewery Seven Points TX
## 2 Freetail Brewing Company San Antonio TX
## 3 Four Peaks Brewing Company Tempe AZ
## 4 Oskar Blues Brewery Longmont CO
## 5 Oskar Blues Brewery Longmont CO
#####################################
## CREATE MEDIAN ABV MAPPING
#####################################
#rename column in plot_beers for simplicity for mapping
plot_beers <- as_tibble(medians)
plot_beers <- rename(plot_beers, abbr=State)
plot_beers <- left_join(plot_beers, statepop, by = c("abbr" = "abbr"))
#fill in missing values NA=0
#plot_beers[is.na(plot_beers)] <- 0 #<-----Uncomment to place zeros on IBU clorograph
#create a percent column for ABV
plot_beers$ABV_Percent <- plot_beers$MedianABV*100
#lets make a map
usa <- map_data("usa")
w2hr <- map_data("world2Hires")
states <- map_data("state")
#create cap function
simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
}
#import states
states$region<-sapply(states$region, simpleCap)
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers, by=c("region" = "full")))
#find centroid of state
mean_geo <- as_tibble(plot_beers_states %>%
group_by(region) %>%
summarise(lat=mean(c(max(lat), min(lat))), long=mean(c(max(long), min(long)))))
#joins centroids to original data
plot_beers <-inner_join(plot_beers, mean_geo, by=c("full" = "region"))
#add offsets so I can adjust labeling on the map
plot_beers$offset <- ifelse(plot_beers$full %in% c("New Hampshire",
#"Vermont",
"Massachusetts",
"Rhode Island",
"New York",
"New Jersey",
"Delaware"), 1, 0)
#add in brewer counts
plot_beers <- inner_join(plot_beers, breweries_count, by=c("abbr" = "State"))
names(plot_beers)[11] <- "count_breweries"
#add in brew counts
plot_beers <- inner_join(plot_beers, brews_count, by=c("abbr" = "State"))
names(plot_beers)[12] <- "count_brews"
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
#cut out the fluff so I can add it back piece by piece
ditch_the_axes <- theme(
axis.text = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.title = element_blank()
)
#####################################
## MAP ABV
#####################################
#create Median % ABV Map
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = MedianABV), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', labels = scales::percent) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(MedianABV*100), "%")), color="black")+
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(MedianABV*100), "%")),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = "Median Percent ABV By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Percent ABV"))
p
#####################################
## CREATE MEDIAN IBU MAPPING
#####################################
#added code here to modify NA labels to "No Data" for clarity
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = MedianIBU), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(ifelse(is.na(MedianIBU)==TRUE, "No Data", MedianIBU)))), color="black") +
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(MedianIBU))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = "Median IBU By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "IBU"))
p
#####################################
## CREATE BREWERY MAPPING
#####################################
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,11)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = count_breweries), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(count_breweries))), color="black")+
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(count_breweries))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = "Breweries By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Breweries"))
p
#####################################
## CREATE BREWS MAPPING
#####################################
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,12)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = count_brews), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white")+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(count_brews))), color="black")+
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(count_brews))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = "Craft Beers By State") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Craft Beers"))
p
#####################################
## IMPORT CENSUS DATA & CONSUMPTION
#####################################
#setwd("/Users/chandlervaughn/Dropbox/4. Chandler/Development/git_repositories/r_mustangs/Case1")
setwd(root_dir)
#lets get population information for 201x from US Census
if(!file.exists("Data/population.csv")){
res <- tryCatch(download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv",
destfile="Data/population.csv",
method="libcurl"),
error=function(e) 1)
}
#download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv", destfile="Data/population.csv", method="libcurl")
population <- read_csv("Data/population.csv")
#download.file(url="https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-popchg2010_2018.csv", destfile="Data/population.csv", method="libcurl")
population <- read_csv("Data/population.csv")
#fix State Capitalization
population$NAME<-sapply(population$NAME, simpleCap)
#fix DC
population$NAME<-ifelse(population$NAME =="District of Columbia", "DC", population$NAME)
#remove unneeded data
remove <- with(population, (NAME == "Northeast Region") | (NAME == "Midwest Region") | (NAME == "South Region") | (NAME == "West Region") | (NAME == "United States") | (NAME == "Puerto Rico") )
population<-population[!remove,]
#lets get consumption data for 2016 - pre downloaded from authorized site
#@misc{openICPS65:online,
#howpublished = {\url{https://www.openicpsr.org/openicpsr/project/105583/version/V2/download/terms?path=/openicpsr/105583/fcr:versions/V2&type=project}},
#note = {(Accessed on 02/19/2019)}
#}
consumption <- read_csv("Data/apparent_per_capita_alcohol_consumption.csv",
col_type = cols(
state = col_character(),
year = col_double(),
ethanol_beer_gallons_per_capita = col_double(),
ethanol_wine_gallons_per_capita = col_double(),
ethanol_spirit_gallons_per_capita = col_double(),
ethanol_all_drinks_gallons_per_capita = col_double(),
number_of_beers = col_double(),
number_of_glasses_wine = col_double(),
number_of_shots_liquor = col_double(),
number_of_drinks_total = col_double()
)
)
#fix State Capitalization
consumption$state<-sapply(consumption$state, simpleCap)
#fix DC
consumption$state<-ifelse(consumption$state =="District Of Columbia", "DC", consumption$state)
#remove unneeded data
remove <- with(consumption, (state == "Northeast Region") | (state == "Midwest Region") | (state == "South Region") | (state == "West Region") | (state == "Us Total") )
consumption<-consumption[!remove,]
#find 2016
consump_2016<-filter(consumption, near(year,2016))
#lets pull in population by state
consump_2016<-inner_join(consump_2016, population, by=c("state"="NAME"))
gallons_beer_by_state <- consump_2016 %>%
group_by(state) %>%
summarise(gallons_beer = ethanol_beer_gallons_per_capita*POPESTIMATE2016)#, beer_per_capita=ethanol_beer_gallons_per_capita)
#add in gallons counts
plot_beers <- inner_join(plot_beers, gallons_beer_by_state, by=c("full" = "state"))
#names(plot_beers)[13] <- "gallons_beer"
#plot_beers <- inner_join(plot_beers, gallons_beer_by_state, by=c("full" = "state"))
#names(plot_beers)[13] <- "gallons_beer"
#add in population and per capita
#join for lats and longs
plot_beers <- inner_join(plot_beers, consump_2016[,c(1,3,22)], by=c("full" = "state"))
names(plot_beers)[14] <- "gallons_beer_per_capita"
names(plot_beers)[15] <- "population_2016"
#####################################
## CREATE Gallons MAPPING
#####################################
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,13)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = gallons_beer), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(gallons_beer/1000000, digits=2)))), color="black") +
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(gallons_beer/1000000, digits=2)))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = "Gallons Beer Consumed By State ('000,000's)") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = "Gallons Beer Consumed"))
p
#####################################
## CREATE MUSTANG SCORE MAPPING
## THIS REPRESENTS THE PUNCHLINE FOR RECOMMENDATIONS ON HOPSY
#####################################
#Lets construct a scoring mechanism based on weighted averages
#Market opportunity is a weighted function of several variables
#Beer consumption in the state, the population of the state, the breweries in the state, and the beer diversity
#Along with an assumption that ABV and IBU are predictors of "at home" demand
#We will construct a score based on the sum of weighted averages for these metrics
#Scoring will be scaled from 0 --> 100 as an aggregate for all states (e.g. the sum of all state scores sum to 100)
total_pop <- sum(plot_beers$population_2016, na.rm=TRUE)
total_gallons <- sum(plot_beers$gallons_beer, na.rm=TRUE)
total_gallons_per_capita <- sum(plot_beers$gallons_beer_per_capita, na.rm=TRUE)
total_breweries <- sum(plot_beers$count_breweries, na.rm=TRUE)
total_brews <- sum(plot_beers$count_brews, na.rm=TRUE)
total_medians <- sum(plot_beers$MedianABV, na.rm=TRUE)
total_ibu <- sum(plot_beers$MedianIBU, na.rm=TRUE)
plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita +
ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 1*plot_beers$population_2016)/total_pop +
ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 1*plot_beers$count_breweries)/total_breweries +
ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 1*plot_beers$count_brews)/total_brews +
ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 1*plot_beers$MedianABV)/total_medians +
ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 1*plot_beers$MedianIBU) /total_ibu)
#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
## abbr MedianABV MedianIBU fips full pop_2015 ABV_Percent lat long
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CA 0.058 42 06 Cali… 39144818 5.8 37.3 -119.
## 2 CO 0.0605 40 08 Colo… 5456574 6.05 39.0 -106.
## 3 TX 0.055 33 48 Texas 27469114 5.5 31.2 -100.
## 4 MI 0.062 35 26 Mich… 9922576 6.2 44.6 -86.4
## 5 PA 0.057 30 42 Penn… 12802503 5.7 41.0 -77.6
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## # count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## # population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "Equal Weights")) +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))
p
#####################################
## CREATE NEW GRAPHIC WITH DOUBLE WEIGHTED IBU
#####################################
plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita +
ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 0.8*plot_beers$population_2016)/total_pop +
ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 0.8*plot_beers$count_breweries)/total_breweries +
ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 0.8*plot_beers$count_brews)/total_brews +
ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 0.8*plot_beers$MedianABV)/total_medians +
ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 2*plot_beers$MedianIBU) /total_ibu)
#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
## abbr MedianABV MedianIBU fips full pop_2015 ABV_Percent lat long
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CA 0.058 42 06 Cali… 39144818 5.8 37.3 -119.
## 2 CO 0.0605 40 08 Colo… 5456574 6.05 39.0 -106.
## 3 TX 0.055 33 48 Texas 27469114 5.5 31.2 -100.
## 4 MI 0.062 35 26 Mich… 9922576 6.2 44.6 -86.4
## 5 FL 0.057 55 12 Flor… 20271272 5.7 28.1 -83.8
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## # count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## # population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "2x IBU Weight")) +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))
p
#####################################
## CREATE NEW GRAPHIC WITH DOUBLE WEIGHTED ABV
#####################################
plot_beers$market_score <- 100*(1/6)*(ifelse(is.na(plot_beers$gallons_beer_per_capita)==TRUE, 0, 0.8*plot_beers$gallons_beer_per_capita)/total_gallons_per_capita +
ifelse(is.na(plot_beers$population_2016)==TRUE, 0, 0.8*plot_beers$population_2016)/total_pop +
ifelse(is.na(plot_beers$count_breweries)==TRUE, 0, 0.8*plot_beers$count_breweries)/total_breweries +
ifelse(is.na(plot_beers$count_brews)==TRUE, 0, 0.8*plot_beers$count_brews)/total_brews +
ifelse(is.na(plot_beers$MedianABV)==TRUE, 0, 2*plot_beers$MedianABV)/total_medians +
ifelse(is.na(plot_beers$MedianIBU)==TRUE, 0, 0.8*plot_beers$MedianIBU) /total_ibu)
#lets see top score areas
head(plot_beers[order(-plot_beers$market_score),], 5)
## # A tibble: 5 x 16
## abbr MedianABV MedianIBU fips full pop_2015 ABV_Percent lat long
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CA 0.058 42 06 Cali… 39144818 5.8 37.3 -119.
## 2 CO 0.0605 40 08 Colo… 5456574 6.05 39.0 -106.
## 3 TX 0.055 33 48 Texas 27469114 5.5 31.2 -100.
## 4 MI 0.062 35 26 Mich… 9922576 6.2 44.6 -86.4
## 5 PA 0.057 30 42 Penn… 12802503 5.7 41.0 -77.6
## # … with 7 more variables: offset <dbl>, count_breweries <int>,
## # count_brews <int>, gallons_beer <dbl>, gallons_beer_per_capita <dbl>,
## # population_2016 <dbl>, market_score <dbl>
#join for lats and longs
plot_beers_states <- as_tibble(inner_join(states, plot_beers[,c(5,16)], by=c("region" = "full")))
#create a base map
us_base <- ggplot(data = plot_beers_states) +
geom_polygon(aes(x = long, y = lat, group = group), color = "white") +
coord_fixed(1.3) #+
p <- us_base +
geom_polygon(aes(x = long, y = lat, group = group, fill = market_score), color = "white") +
scale_fill_gradient(low='#d7edfe', high='#23619f', na.value="white", labels=scales::comma_format())+
geom_polygon(aes(x = long, y = lat, group = group), color = "white", fill = NA) +
theme_bw() +
ditch_the_axes +
geom_text(data=subset(plot_beers, offset==0), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))), color="black") +
geom_text_repel(data=subset(plot_beers, offset==1), aes(x = long, y = lat, label = paste(as.character(round(market_score, digits=2)))),
box.padding = unit(0.4, "lines"),
point.padding = unit(0.2, "lines"),
direction="x",
nudge_x = 0.7) +
labs(title = paste0("Mustang Score","\n","(All Scores Sum to 100)", "\n", "2x ABV Weight")) +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill = guide_legend(title = paste0("Mustang Score","\n","(Higher Is Better)")))
p
############################
## GENERATE ALL CODEBOOKS
############################]#####################################
## GENERATE CODEBOOKS FOR ALL RAW DATASETS
## PLEASE NOTE CODEBOOKS ONLY INCLUDE ORIGINAL DATA
## TRANSITORY AND DERIVATIVE DATASETS ARE NOT INCLUDED
#####################################
setwd(paste0(root_dir,"/Codebook"))
suppressWarnings(suppressMessages(makeCodebook(beers, codebook = TRUE, replace=TRUE)))
suppressWarnings(suppressMessages(makeCodebook(breweries, codebook = TRUE, replace=TRUE)))
suppressWarnings(suppressMessages(makeCodebook(population, codebook = TRUE, replace=TRUE)))
suppressWarnings(suppressMessages(makeCodebook(consumption, codebook = TRUE, replace=TRUE)))
setwd(root_dir)
For this study R Mustangs was tasked with analyzing both Beers and Breweries data sets for the United States. The premise of this study is that Hopsy, a premier source-to-consumer beer distributor, is expanding their Market Penetration Strategy and would like to targeted areas and for signing up new Craft Brewers. Additionally, they plan on exploring cross-state distributions opportunities. This would entail selling from their existing stable of supplier beers, to new states with high market opportunity for their Craft beer consumption.
R Mustangs was ask to:
Analyze the U.S. Craft Beer and Brewery Market data and provide insights on product and market dynamics.
Recommend target geographies which can aligned to their 2020 Market Sourcing Strategy for the existing product offerings as well as potential cross-state distribution.
Recommend a data driven targeting strategy to evaluate these questions for future growth stage activities.
While the data had some gaps (1005 missing values for IBU, 62 missing values for ABV, and 5 missing values for Styles), there was plenty of data to work with. It was found from this study that regional preferences on beer as well as State-level factors impact how many Breweries are in each state. It was also found that ABV and IBU are linearly corrolated and that IBU can serve as a relative proxy to determine overall categorical ABV percentiles.
In order to hav a fulsome conversation on Market Dynamics, the R Mustang team pulled in both Census population data and estimated per capita consumption data. This allowed us to visualize State-by-State differences. And, more importantly, it allowed us to introduce a framework for evaluating States for further discussion, prioritization, and future market penetration. We introduce the concept of the Mustang Scoring Framework, and through this initial analysis recommend that Hopsy review California, Colorado, Texas, Michigan, and Pennsylvania for the next States Hopsy should target. If those States already have penetration plans, the Mustang Scoring Framework provides a crystallized ranking system for review in order to determine the next State to target.
For future work, we would recommend analysis of pricing and incorporating that into the overall data framework. We would also recommend updating all datasets with the most current data lineages.
We thank Hopsy for the opportunity to serve them and hope that we may be of service again in the very near future.